library(tidyverse)
library(knitr)
library(plotly) ; library(viridis) ; library(gridExtra) ; library(RColorBrewer) ; library(ggpubr)
library(biomaRt)
library(polycor)
library(caret) ; library(ROCR) ; library(car) ; library(MLmetrics)
library(corrplot)
library(expss) ; library(knitr) ; library(kableExtra)
library(foreach) ; library(doParallel)
# Map SFARI gene-score indices to the colour palette used throughout the plots
# (positions 1-6 = SFARI scores 1-6, positions 7-10 = greys for the
# Neuronal/Others/None groups). `r` may be a single index or a vector.
SFARI_colour_hue = function(r) {
  # Return the selected colours directly; the original assigned the result to
  # a local `pal`, which made the function return its value invisibly
  c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}
# Choose which clustering to analyse and attach its module assignment to each gene
clustering_selected = 'DynamicHybrid'
clusterings = read_csv('./../Data/clusters.csv')
# Extract the selected clustering's column as a plain vector
clusterings$Module = clusterings[[clustering_selected]]
assigned_module = clusterings %>% dplyr::select(ID, Module)
# Update gene scores to new SFARI Genes
SFARI_genes = read_csv('./../../../SFARI/Data/SFARI_genes_08-29-2019_w_ensembl_IDs.csv')
# Keep a single row per Ensembl ID and drop entries without an ID
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]
# Dataset created with 20_04_07_create_dataset.html (change SFARI scores to Old SFARI scores)
dataset = read.csv(paste0('./../Data/dataset_', clustering_selected, '.csv'), row.names=1)
rownames_dataset = rownames(dataset)
# Replace the dataset's stored SFARI annotation with the old SFARI scores:
# drop gene.score, join the old scores by Ensembl ID, and relabel missing
# scores as 'None'
# NOTE(review): Module is assigned positionally from clusterings, which assumes
# dataset and clusterings list the genes in the same order -- confirm upstream
dataset = dataset %>% dplyr::select(-gene.score) %>% mutate(ID = rownames_dataset) %>%
left_join(SFARI_genes %>% dplyr::select(ID, `gene-score`), by = 'ID') %>%
mutate(Module = clusterings$Module,
gene.score = ifelse(is.na(`gene-score`), 'None', `gene-score`)) %>%
dplyr::select(-matches(clustering_selected), -ID, -`gene-score`)
rownames(dataset) = rownames_dataset
# Fix any Gene Significance that is NA
GS_missing = rownames(dataset)[is.na(dataset$GS)]
if(length(GS_missing)>0){
# Gandal dataset: expression matrix + metadata needed to recompute GS
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
# GS = polyserial correlation between the gene's expression and Diagnosis
for(g in GS_missing){
dataset$GS[rownames(dataset) == g] = polyserial(as.numeric(datExpr[g,]), datMeta$Diagnosis)
}
rm(datExpr, datGenes, datMeta, dds, DE_info, g, GS_missing)
}
# GO Neuronal annotations: regex 'neuron' in GO functional annotations and label the genes that make a match as neuronal
GO_annotations = read.csv('./../Data/genes_GO_annotations.csv')
# One row per gene matching 'neuron' in any GO term, flagged with Neuronal = 1
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>%
mutate('ID'=as.character(ensembl_gene_id)) %>%
dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
mutate('Neuronal'=1)
# Add gene symbol
# Query the Feb-2014 Ensembl archive (Gencode v19) for the symbol of every
# gene in the dataset (requires network access)
getinfo = c('ensembl_gene_id','external_gene_id')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl',
host='feb2014.archive.ensembl.org') ## Gencode v19
gene_names = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=rownames(dataset), mart=mart)
rm(getinfo, mart, rownames_dataset, GO_annotations)
The pipeline to prepare the dataset is the same as in 10_classification_model.html, so I won’t repeat the explanation here
rm_cluster = dataset[is.na(dataset$MTcor),'Module'] %>% unique %>% as.character
# Remove unclustered (gray) genes and genes without a Module-Trait correlation,
# drop the p-value/Module columns, and add absolute GS plus the binary SFARI label
new_dataset = dataset %>% filter(Module != 'gray' & !is.na(MTcor)) %>%
dplyr::select(-c(matches(paste('pval|Module')), MMgray)) %>%
mutate('absGS'=abs(GS), 'SFARI'=gene.score!='None') %>% dplyr::select(-gene.score)
# BUG FIX: rownames must use the same condition as the filter above (non-gray
# AND non-missing MTcor); the original only excluded gray modules, so the
# lengths could disagree whenever a non-gray gene had MTcor == NA
rownames(new_dataset) = rownames(dataset)[dataset$Module != 'gray' & !is.na(dataset$MTcor)]
original_dataset = dataset
dataset = new_dataset
rm(rm_cluster, new_dataset)
The final dataset contains 15994 observations (genes) and 59 variables
The Module Membership variables are grouped by Module-Trait correlation, with positive correlations on one side, negative on the other, and both SFARI and absGS are in the middle of both groups
The position of the SFARI label almost doesn’t change
# PCA of the model variables (dataset transposed so variables are observations)
# to see how the MM, GS, MTcor and SFARI features relate to each other
pca = dataset %>% mutate(SFARI = as.numeric(SFARI)) %>% t %>% prcomp
plot_data = data.frame('ID'=colnames(dataset), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2],
type=ifelse(grepl('MM', colnames(dataset)),'ModMembership',
ifelse(grepl('SFARI', colnames(dataset)), 'SFARI',
ifelse(grepl('GS', colnames(dataset)), 'GS', 'MTcor'))))
# Attach each module's Module-Trait correlation, mapping '#RRGGBB' module names
# onto the 'MM.RRGGBB' feature names used in the dataset
mtcor_by_module = original_dataset %>% dplyr::select(Module, MTcor) %>% unique
colnames(mtcor_by_module) = c('ID','MTcor')
plot_data = mtcor_by_module %>% mutate(ID = gsub('#','MM.',ID)) %>% right_join(plot_data, by='ID')
ggplotly(plot_data %>% ggplot(aes(PC1, PC2, color=MTcor)) + geom_point(aes(id=ID)) +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) +
scale_colour_distiller(palette = 'RdBu', na.value = 'darkgrey') + theme_minimal() +
ggtitle('PCA of variables coloured by Module-Diagnosis correlation'))
rm(mtcor_by_module, pca)
# Mean Expression data
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
mean_expr = data.frame('ID'=rownames(datExpr), 'meanExpr' = rowMeans(datExpr))
# PCA
# Genes-as-observations PCA; because the dataset is transposed before prcomp,
# the per-gene coordinates live in pca$rotation
pca = dataset %>% t %>% prcomp
plot_data = data.frame('ID'=rownames(dataset), 'PC1'=pca$rotation[,1], 'PC2'=pca$rotation[,2],
'SFARI'=dataset$SFARI, 'MTcor'=dataset$MTcor, 'GS'=dataset$GS) %>%
mutate(alpha=ifelse(SFARI, 0.7, 0.2)) %>% left_join(mean_expr, by='ID')
# SFARI genes drawn with higher alpha; marginal densities by SFARI label
p = plot_data %>% ggplot(aes(PC1, PC2, color=SFARI)) + geom_point(alpha = plot_data$alpha) +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1]),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2]),'%)')) +
theme_minimal() + ggtitle('Genes coloured by SFARI label') + theme(legend.position='bottom')
ggExtra::ggMarginal(p, type='density', groupColour=TRUE, size=10)
rm(pca, datExpr, datGenes, datMeta, dds, DE_info, mean_expr, p)
Using the same method explained in 10_classification_model.html
The only difference is that we now have a slightly larger proportion of positive observations than before (5.6%)
# Cross-tabulate the SFARI label to show the class imbalance
table_info = dataset %>% apply_labels(SFARI = 'SFARI')
cro(table_info$SFARI)
| #Total | |
|---|---|
| SFARI | |
| FALSE | 15099 |
| TRUE | 895 |
| #Total cases | 15994 |
rm(table_info)
set.seed(123)
# Stratify the 70/30 train/test split by (old) SFARI gene score so every score
# is represented proportionally in both sets
sample_scores = dataset %>% mutate(ID = rownames(.)) %>% dplyr::select(ID) %>%
left_join(original_dataset %>% mutate(ID = rownames(.)) %>% dplyr::select(ID, gene.score),
by = 'ID') %>%
mutate(gene.score = ifelse(is.na(gene.score), 'None', gene.score))
train_idx = createDataPartition(sample_scores$gene.score, p = 0.7, list = FALSE)
train_set = dataset[train_idx,]
test_set = dataset[-train_idx,]
rm(sample_scores, train_idx)
# https://shiring.github.io/machine_learning/2017/04/02/unbalanced
# https://topepo.github.io/caret/using-your-own-model-in-train.html#Illustration5
# caret needs a factor response with syntactically valid level names
train_set = train_set %>% mutate(SFARI = ifelse(SFARI==TRUE, 'SFARI', 'not_SFARI') %>% as.factor)
# 10-fold CV repeated 10 times, with SMOTE over-sampling applied inside each
# fold to correct the class imbalance
smote_over_sampling = trainControl(method = 'repeatedcv', number = 10, repeats = 10, verboseIter = FALSE,
classProbs = TRUE, savePredictions = 'final',
summaryFunction = twoClassSummary, sampling = 'smote')
# Using ROC as metric because it doesn't depend on the threshold
fit = caret::train(SFARI ~ ., data = train_set, method = 'glm', family = 'binomial', metric = 'ROC',
trControl = smote_over_sampling)
#a = caret::thresholder(fit, threshold = seq(0.05, 0.95, by = 0.1), statistics = 'all')
rm(smote_over_sampling)
The model has an AUC of 0.6664, but, as before, the features are strongly correlated
# VIF
# Compute the VIF once (the original called car::vif twice, once for the names
# and once for the values) and flag features above the usual threshold of 10
vif_vals = car::vif(fit$finalModel) %>% sort
plot_data = data.frame('Feature' = names(vif_vals),
'VIF' = unname(vif_vals)) %>%
mutate(outlier = VIF>10)
plot_data %>% ggplot(aes(reorder(Feature, -VIF), VIF, fill = !outlier)) + geom_bar(stat='identity') +
scale_y_log10() + geom_hline(yintercept = 10, color = 'gray', linetype = 'dashed') +
xlab('Model Features') + ggtitle('Variance Inflation Number for each Feature') + theme_minimal() +
theme(legend.position = 'none', axis.text.x = element_text(angle = 90, hjust = 1))
rm(plot_data, vif_vals)
Remove all variables with a VIF>10: We would lose all but two of our variables, not ideal
Do Principal Component Regression: We would lose the relation between the prediction and the original features, which could be interesting to study
Don’t do anything: Multicollinearity affects the coefficients and p-values of the regression, but it doesn’t affect the predictions, precision of the predictions or the goodness-of-fit statistics ref, but as with the previous option, we cannot study the coefficients of the regression
Use Ridge Regression: The penalty it gives to high coefficients reduces the variance introduced by the correlation, making the coefficients interpretable again
Notes:
Running the model multiple times to get more accurate measurements of its performance
Over-sampling positive samples in the training set to obtain a 1:1 class ratio using SMOTE
### DEFINE FUNCTIONS
# Split the global `dataset` into train/test sets with proportion `p` in train,
# stratified by the old SFARI gene score (taken from `original_dataset`)
create_train_test_sets = function(p, seed){
# Get SFARI Score of all the samples so our train and test sets are balanced for each score
gene_scores = original_dataset %>% mutate(ID = rownames(.)) %>%
dplyr::select(ID, gene.score)
sample_scores = dataset %>% mutate(ID = rownames(.)) %>% dplyr::select(ID) %>%
left_join(gene_scores, by = 'ID') %>%
mutate(gene.score = ifelse(is.na(gene.score), 'None', gene.score))
set.seed(seed)
train_idx = createDataPartition(sample_scores$gene.score, p = p, list = FALSE)
# Rows selected by the stratified partition go to train, the rest to test
list('train_set' = dataset[train_idx,], 'test_set' = dataset[-train_idx,])
}
# Train a ridge-regularised logistic regression (glmnet with alpha = 0) on a
# stratified split and return test-set performance metrics, per-gene
# predictions and the fitted coefficients
run_model = function(p, over_sampling_fold, seed){
# Create train and test sets
train_test_sets = create_train_test_sets(p, seed)
train_set = train_test_sets[['train_set']]
test_set = train_test_sets[['test_set']]
# Train Model
train_set = train_set %>% mutate(SFARI = ifelse(SFARI==TRUE, 'SFARI', 'not_SFARI') %>% as.factor)
# Ridge path: lambda from 10 down to 1e-4 on a log scale
lambda_seq = 10^seq(1, -4, by = -.1)
# Seed set immediately before train() so CV folds and SMOTE sampling are
# reproducible for a given seed -- do not reorder these statements
set.seed(seed)
smote_over_sampling = trainControl(method = 'repeatedcv', number = over_sampling_fold, repeats = 10,
verboseIter = FALSE, classProbs = TRUE, savePredictions = 'final',
summaryFunction = twoClassSummary, sampling = 'smote')
fit = train(SFARI ~., data = train_set, method = 'glmnet', trControl = smote_over_sampling, metric = 'ROC',
tuneGrid = expand.grid(alpha = 0, lambda = lambda_seq))
# Predict labels in test set (hard labels use a fixed 0.5 threshold)
predictions = fit %>% predict(test_set, type = 'prob')
preds = data.frame('ID' = rownames(test_set), 'prob' = predictions$SFARI) %>% mutate(pred = prob>0.5)
# Measure performance of the model (logicals coerced to 0/1, positive class '1')
acc = mean(test_set$SFARI==preds$pred)
prec = Precision(test_set$SFARI %>% as.numeric, preds$pred %>% as.numeric, positive = '1')
rec = Recall(test_set$SFARI %>% as.numeric, preds$pred %>% as.numeric, positive = '1')
F1 = F1_Score(test_set$SFARI %>% as.numeric, preds$pred %>% as.numeric, positive = '1')
pred_ROCR = prediction(preds$prob, test_set$SFARI)
AUC = performance(pred_ROCR, measure='auc')@y.values[[1]]
# Extract coefficients from features (intercept first, then predictors in
# model-matrix column order)
coefs = coef(fit$finalModel, fit$bestTune$lambda) %>% as.vector
return(list('acc' = acc, 'prec' = prec, 'rec' = rec, 'F1' = F1,
'AUC' = AUC, 'preds' = preds, 'coefs' = coefs))
}
### RUN MODEL
# Parameters
p = 0.75
over_sampling_fold = 10
n_iter = 25
seeds = 123:(123+n_iter-1)
# Store outputs
acc = c()
prec = c()
rec = c()
F1 = c()
AUC = c()
# Per-gene accumulators: prob/pred are summed across iterations and n counts
# how many times each gene landed in a test set
predictions = data.frame('ID' = rownames(dataset), 'SFARI' = dataset$SFARI, 'prob' = 0, 'pred' = 0, 'n' = 0)
# One coefficient slot per predictor plus the intercept (last column is SFARI,
# the response, so it is excluded)
coefs = data.frame('var' = c('Intercept', colnames(dataset[,-ncol(dataset)])), 'coef' = 0)
for(seed in seeds){
# Run model
model_output = run_model(p, over_sampling_fold, seed)
# Update outputs
acc = c(acc, model_output[['acc']])
prec = c(prec, model_output[['prec']])
rec = c(rec, model_output[['rec']])
F1 = c(F1, model_output[['F1']])
AUC = c(AUC, model_output[['AUC']])
preds = model_output[['preds']]
coefs$coef = coefs$coef + model_output[['coefs']]
# Add this iteration's prob/pred to the accumulators of its test-set genes
update_preds = preds %>% dplyr::select(-ID) %>% mutate(n=1)
predictions[predictions$ID %in% preds$ID, c('prob','pred','n')] = predictions[predictions$ID %in%
preds$ID, c('prob','pred','n')] +
update_preds
}
# Average coefficients over iterations and probabilities over appearances;
# the final hard label applies a 0.5 threshold to the averaged probability
coefs = coefs %>% mutate(coef = coef/n_iter)
predictions = predictions %>% mutate(prob = prob/n, pred_count = pred, pred = prob>0.5)
rm(p, over_sampling_fold, seeds, update_preds, create_train_test_sets, run_model)
To summarise in a single value the predictions of the models:
prob = Average of all the probabilities
pred = 1 if prob>0.5, 0 otherwise
# Keep only genes that appeared in at least one test set and attach GS and MTcor
test_set = predictions %>% filter(n>0) %>%
left_join(dataset %>% mutate(ID = rownames(.)) %>% dplyr::select(ID, GS, MTcor), by = 'ID')
rownames(test_set) = predictions$ID[predictions$n>0]
# Confusion matrix of the averaged predictions
conf_mat = test_set %>% apply_labels(SFARI = 'Actual Labels',
prob = 'Assigned Probability',
pred = 'Label Prediction')
cro(conf_mat$SFARI, list(conf_mat$pred, total()))
| Label Prediction | #Total | |||
|---|---|---|---|---|
| FALSE | TRUE | |||
| Actual Labels | ||||
| FALSE | 11905 | 3181 | 15086 | |
| TRUE | 484 | 411 | 895 | |
| #Total cases | 12389 | 3592 | 15981 | |
rm(conf_mat)
# ROC and lift curves of the averaged probabilities on the pooled test set
pred_ROCR = prediction(test_set$prob, test_set$SFARI)
roc_ROCR = performance(pred_ROCR, measure='tpr', x.measure='fpr')
auc = performance(pred_ROCR, measure='auc')@y.values[[1]]
plot(roc_ROCR, main=paste0('ROC curve (AUC=',round(auc,2),')'), col='#009999')
abline(a=0, b=1, col='#666666')
lift_ROCR = performance(pred_ROCR, measure='lift', x.measure='rpp')
plot(lift_ROCR, main='Lift curve', col='#86b300')
# BUG FIX: remove the local `auc` rather than `AUC` -- `AUC` holds the
# per-iteration AUCs collected in the modelling loop (kept alongside acc,
# prec, rec and F1), while `auc` is only needed for the plot title above
rm(pred_ROCR, roc_ROCR, auc, lift_ROCR)
MTcor, GS and absGS have a very small coefficient
# Join each Module-Membership feature to its module's MTcor and to the fraction
# of SFARI genes in that module
# NOTE(review): gsub('MM.', ...) uses '.' as a regex wildcard -- fine here
# because the MM feature names are exactly 'MM.<colour>'
gene_corr_info = dataset %>% mutate('ID' = rownames(dataset)) %>% dplyr::select(ID, MTcor, SFARI) %>%
left_join(assigned_module, by ='ID') %>% mutate(Module = gsub('#','',Module))
coef_info = coefs %>% mutate('feature' = gsub('MM.','',var)) %>%
left_join(gene_corr_info, by = c('feature' = 'Module')) %>%
dplyr::select(feature, coef, MTcor, SFARI) %>% group_by(feature, coef, MTcor) %>%
summarise('SFARI_perc' = mean(SFARI)) %>% arrange(desc(coef))
# Report the coefficients of the non-MM features
coef_info %>% dplyr::select(feature, coef) %>% filter(feature %in% c('Intercept','GS','absGS','MTcor')) %>%
dplyr::rename('Feature' = feature, 'Coefficient' = coef) %>%
kable(align = 'cc', caption = 'Regression Coefficients') %>% kable_styling(full_width = F)
| Feature | Coefficient |
|---|---|
| MTcor | 0.0001566 |
| absGS | -0.0372042 |
| GS | -0.1208186 |
| Intercept | -0.5215226 |
load('./../Data/ORA.RData')
# SFARI enrichment per module: 1 - p-value of the 'SFARI' term, or 0 when the
# module has no SFARI entry in its ORA results
enrichment_SFARI_info = data.frame('Module'=as.character(), 'SFARI_enrichment'=as.numeric())
# BUG FIX: iterate over the same list that the body indexes. The original
# looped over names(enrichment_SFARI) while reading enrichment_old_SFARI[[m]],
# which only works by accident if both lists share identical module names
for(m in names(enrichment_old_SFARI)){
m_info = enrichment_old_SFARI[[m]]
enrichment = 1-ifelse('SFARI' %in% m_info$ID, m_info$pvalue[m_info$ID=='SFARI'],1)
enrichment_SFARI_info = enrichment_SFARI_info %>%
add_row(Module = gsub('#','',m), SFARI_enrichment = enrichment)
}
# Coefficient of each module's MM feature vs that module's SFARI enrichment;
# points coloured with the module's own colour
plot_data = coef_info %>% dplyr::rename('Module' = feature) %>%
left_join(enrichment_SFARI_info, by = 'Module') %>% filter(!is.na(MTcor))
ggplotly(plot_data %>% ggplot(aes(coef, SFARI_enrichment)) +
geom_smooth(method = 'lm', color = 'gray', alpha = 0.1) +
geom_point(aes(id = Module), color = paste0('#',plot_data$Module), alpha=0.7) +
theme_minimal() + xlab('Coefficient') +
ylab('SFARI Genes Enrichment'))
rm(enrichment_SFARI, enrichment_DGN, enrichment_DO, enrichment_GO, enrichment_KEGG, enrichment_Reactome,
m, m_info, enrichment)
This is not a surprise since we knew that there was a negative relation between SFARI genes and Module-Diagnosis correlation from Preprocessing/Gandal/AllRegions/RMarkdowns/20_04_03_WGCNA_modules_EA.html. The fact that there is no relation between coefficient and Module-Diagnosis correlation could even be a good sign that the model is picking some biological signal as well as the SFARI patterns (since the relation with the biological signals is positive)
# Coefficient vs Module-Diagnosis correlation, points coloured by module colour
# NOTE(review): the colour vector is indexed from coef_info with the same
# filter applied to the plotted data, so the ordering should line up -- confirm
ggplotly(coef_info %>% dplyr::rename('Module' = feature) %>% filter(!is.na(MTcor)) %>%
ggplot(aes(coef, MTcor)) + geom_smooth(method = 'lm', color = 'gray', alpha = 0.1) +
geom_point(aes(id = Module), color = paste0('#',coef_info$feature[!is.na(coef_info$MTcor)]),
alpha=0.7) +
theme_minimal() + xlab('Coefficient') +
ylab('Module-Diagnosis correlation'))
Note: All of the conclusions from this section are the same found with the new SFARI Genes in 10_classification_model.html
SFARI genes have a higher score distribution than the rest, but the overlap is large
# Model score densities by SFARI label, dashed lines mark each group's mean
plot_data = test_set %>% dplyr::select(prob, SFARI)
ggplotly(plot_data %>% ggplot(aes(prob, fill=SFARI, color=SFARI)) + geom_density(alpha=0.3) + xlab('Score') +
geom_vline(xintercept = mean(plot_data$prob[plot_data$SFARI]), color = '#00C0C2', linetype='dashed') +
geom_vline(xintercept = mean(plot_data$prob[!plot_data$SFARI]), color = '#FF7371', linetype='dashed') +
theme_minimal() + ggtitle('Model score distribution by SFARI Label'))
Even though we didn’t use the actual SFARI Scores to train the model (we grouped them all together into a single label), there seems to be a positive relation between the SFARI scores and the probability assigned by the model. It’s not significant for adjacent scores, but it usually is for more distant scores
# Expand the binary SFARI label back into the individual scores, tagging
# non-SFARI genes as 'Neuronal' (GO annotation) or 'Others'
plot_data = test_set %>% mutate(ID=rownames(test_set)) %>% dplyr::select(ID, prob) %>%
left_join(original_dataset %>% mutate(ID=rownames(original_dataset)), by='ID') %>%
mutate(gene.score = ifelse(gene.score=='None', ifelse(ID %in% GO_neuronal$ID, 'Neuronal', 'Others'),
gene.score)) %>%
dplyr::select(ID, prob, gene.score) %>% apply_labels(gene.score='SFARI Gene score')
cro(plot_data$gene.score)
| #Total | |
|---|---|
| SFARI Gene score | |
| 1 | 25 |
| 2 | 65 |
| 3 | 191 |
| 4 | 427 |
| 5 | 164 |
| 6 | 23 |
| Neuronal | 906 |
| Others | 14180 |
| #Total cases | 15981 |
# Pairwise Welch t-tests between score groups, drawn at increasing heights
# grouped by distance between the compared groups
# BUG FIX: the original listed c('3','Neuronal') twice and never compared
# groups 3 and 6; the first duplicate is replaced with c('3','6') to complete
# the distance-3 set. The unused `mean_vals` intermediate was also removed.
comparisons = list(c('1','2'), c('2','3'), c('3','4'), c('4','5'), c('5','6'), c('6','Neuronal'), c('Neuronal','Others'),
c('1','3'), c('3','5'), c('5','Neuronal'),
c('2','4'), c('4','6'), c('6','Others'),
c('1','4'), c('4','Neuronal'),
c('2','5'), c('5','Others'),
c('3','6'), c('1','5'), c('2','6'), c('3','Neuronal'), c('4','Others'))
increase = 0.08
base = 0.9
# One y position per comparison (7 + 3 + 3 + 2 + 2 + 5 = 22 entries)
pos_y_comparisons = c(rep(base, 7), rep(base + increase,3), rep(base + 2*increase,3), rep(base + 3*increase, 2),
rep(base + 4*increase, 2), base + 5:9*increase)
plot_data %>% ggplot(aes(gene.score, prob, fill=gene.score)) +
geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3) +
stat_compare_means(comparisons = comparisons, label = 'p.signif', method = 't.test',
method.args = list(var.equal = FALSE), label.y = pos_y_comparisons,
tip.length = .02) +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) +
ggtitle('Distribution of probabilities by SFARI score') +
xlab('SFARI score') + ylab('Probability') + theme_minimal() + theme(legend.position = 'none')
rm(increase, base, pos_y_comparisons)
Considering the class imbalance in the test set (1:19), there are many more SFARI scores in here (1:3)
High concentration of genes with a SFARI Score of 1, especially at the top
# Top 50 genes by model probability in the pooled test set, annotated with
# gene symbol, GS, module and (old) SFARI score
test_set %>% dplyr::select(prob, SFARI) %>% mutate(ID = rownames(test_set)) %>%
arrange(desc(prob)) %>% top_n(50, wt=prob) %>%
left_join(original_dataset %>% mutate(ID=rownames(original_dataset)), by='ID') %>%
mutate(gene.score = ifelse(gene.score=='None', ifelse(ID %in% GO_neuronal$ID, 'Neuronal', 'Others'),
gene.score)) %>%
left_join(gene_names, by = c('ID'='ensembl_gene_id')) %>%
dplyr::rename('GeneSymbol' = external_gene_id, 'Probability' = prob, 'ModuleDiagnosis_corr' =MTcor,
'GeneSignificance' = GS) %>%
mutate(ModuleDiagnosis_corr = round(ModuleDiagnosis_corr,4), Probability = round(Probability,4),
GeneSignificance = round(GeneSignificance,4)) %>%
dplyr::select(GeneSymbol, GeneSignificance, ModuleDiagnosis_corr, Module, Probability,
gene.score) %>%
kable(caption = 'Genes with highest model probabilities from the test set') %>%
kable_styling(full_width = F)
| GeneSymbol | GeneSignificance | ModuleDiagnosis_corr | Module | Probability | gene.score |
|---|---|---|---|---|---|
| TANC2 | -0.2362 | -0.5573 | #00C0BF | 0.8555 | 3 |
| MYCBP2 | -0.3960 | -0.3847 | #00B7E7 | 0.8463 | Neuronal |
| RIMBP2 | 0.2592 | 0.5341 | #EF67EB | 0.8432 | Others |
| RPRD2 | -0.0931 | 0.3428 | #00C08C | 0.8421 | Others |
| MBD5 | 0.1916 | 0.0721 | #96A900 | 0.8388 | 3 |
| ARID1B | 0.2675 | 0.3428 | #00C08C | 0.8380 | 1 |
| ANK2 | -0.4350 | -0.9003 | #00BADE | 0.8347 | 1 |
| PRDM2 | -0.2516 | 0.0721 | #96A900 | 0.8341 | Others |
| KCNJ6 | -0.1389 | -0.9003 | #00BADE | 0.8329 | Others |
| HUNK | -0.3276 | -0.3731 | #4BB400 | 0.8324 | Others |
| HIVEP2 | 0.0188 | -0.9003 | #00BADE | 0.8294 | Others |
| EP300 | -0.0235 | 0.3428 | #00C08C | 0.8254 | 4 |
| RFX7 | 0.1381 | 0.0721 | #96A900 | 0.8237 | Others |
| KMT2A | 0.1343 | 0.6175 | #44A0FF | 0.8188 | 1 |
| RAPH1 | -0.0554 | -0.3847 | #00B7E7 | 0.8184 | Others |
| FMNL1 | -0.2229 | -0.3847 | #00B7E7 | 0.8155 | Others |
| SRRM4 | -0.4423 | -0.5573 | #00C0BF | 0.8116 | 5 |
| ST8SIA3 | -0.1994 | -0.7233 | #FF63B6 | 0.8113 | Others |
| ZNF804A | -0.2756 | -0.3731 | #4BB400 | 0.8112 | 3 |
| GABBR2 | -0.6251 | -0.9003 | #00BADE | 0.8095 | 4 |
| TLN2 | -0.4786 | -0.7233 | #FF63B6 | 0.8073 | Others |
| C20orf112 | 0.0294 | 0.3428 | #00C08C | 0.8069 | Others |
| ETV6 | -0.1439 | -0.6508 | #E76BF3 | 0.8060 | Others |
| RASGRF1 | -0.7424 | -0.9003 | #00BADE | 0.8059 | Neuronal |
| TLE3 | 0.4865 | 0.5341 | #EF67EB | 0.8043 | Others |
| ANK3 | -0.4808 | -0.8123 | #00BECA | 0.8033 | 3 |
| NBEA | -0.4139 | -0.9003 | #00BADE | 0.8012 | 4 |
| GRIN2B | -0.2787 | -0.7233 | #FF63B6 | 0.8011 | 1 |
| CREBBP | 0.1424 | 0.3428 | #00C08C | 0.7987 | 5 |
| CACNG3 | -0.4698 | -0.6770 | #00A6FF | 0.7987 | Others |
| SIK3 | -0.1236 | -0.3847 | #00B7E7 | 0.7980 | Others |
| NRXN3 | -0.6712 | -0.9003 | #00BADE | 0.7977 | 3 |
| CNTNAP2 | -0.6999 | -0.9003 | #00BADE | 0.7969 | 2 |
| CELF2 | -0.0655 | -0.9003 | #00BADE | 0.7961 | Others |
| AAK1 | -0.6910 | -0.9003 | #00BADE | 0.7960 | Others |
| EP400 | -0.1699 | -0.5573 | #00C0BF | 0.7958 | 3 |
| KCNB1 | -0.7190 | -0.9003 | #00BADE | 0.7957 | Neuronal |
| SCAF4 | 0.0161 | -0.5573 | #00C0BF | 0.7921 | Others |
| RNF111 | -0.2393 | 0.0721 | #96A900 | 0.7915 | Others |
| PLXNC1 | -0.0093 | 0.5341 | #EF67EB | 0.7913 | Others |
| SRCAP | -0.3631 | -0.5573 | #00C0BF | 0.7912 | 2 |
| BAZ2A | 0.0518 | 0.0721 | #96A900 | 0.7908 | Others |
| GNAS | -0.5848 | -0.5573 | #00C0BF | 0.7891 | 4 |
| PCDHB13 | -0.0405 | -0.4661 | #1FB700 | 0.7891 | Others |
| GLTSCR1L | -0.1611 | 0.0721 | #96A900 | 0.7887 | Others |
| POM121C | -0.2811 | -0.5573 | #00C0BF | 0.7886 | Others |
| KMT2D | -0.3274 | -0.5573 | #00C0BF | 0.7883 | Others |
| NFAT5 | 0.1025 | 0.0721 | #96A900 | 0.7874 | Others |
| AJAP1 | -0.0441 | -0.0245 | #FE6D8D | 0.7868 | Others |
| DAAM1 | 0.1540 | -0.0127 | #00BF7D | 0.7863 | Others |
Note: All of the conclusions from this section are the same found with the new SFARI Genes in 10_classification_model.html
The objective of this model is to identify candidate SFARI genes. For this, we are going to focus on the negative samples (the non-SFARI genes)
# Candidate pool: the negative class (non-SFARI genes) of the pooled test set
negative_set = test_set %>% filter(!SFARI)
negative_set_table = negative_set %>% apply_labels(prob = 'Assigned Probability',
pred = 'Label Prediction')
cro(negative_set_table$pred)
| #Total | |
|---|---|
| Label Prediction | |
| FALSE | 11905 |
| TRUE | 3181 |
| #Total cases | 15086 |
3181 genes are predicted as ASD-related
# Probability distribution of the non-SFARI genes; dotted line = 0.5 threshold
negative_set %>% ggplot(aes(prob)) + geom_density(color='#F8766D', fill='#F8766D', alpha=0.5) +
geom_vline(xintercept=0.5, color='#333333', linetype='dotted') + xlab('Probability') +
ggtitle('Probability distribution of the Negative samples in the Test Set') +
theme_minimal()
# Top 50 non-SFARI genes by model probability (strongest ASD candidates)
negative_set %>% dplyr::select(prob, SFARI) %>% mutate(ID = rownames(negative_set)) %>%
arrange(desc(prob)) %>% top_n(50, wt = prob) %>%
left_join(original_dataset %>% mutate(ID = rownames(original_dataset)), by = 'ID') %>%
mutate(gene.score = ifelse(gene.score=='None',
ifelse(ID %in% GO_neuronal$ID,'Neuronal','Others'),
gene.score)) %>%
left_join(gene_names, by = c('ID'='ensembl_gene_id')) %>%
dplyr::rename('GeneSymbol' = external_gene_id, 'Probability' = prob,
'ModuleDiagnosis_corr' =MTcor, 'GeneSignificance' = GS) %>%
mutate(ModuleDiagnosis_corr = round(ModuleDiagnosis_corr, 4),
Probability = round(Probability, 4),
GeneSignificance = round(GeneSignificance, 4)) %>%
dplyr::select(GeneSymbol, GeneSignificance, ModuleDiagnosis_corr, Module, Probability,
gene.score) %>%
kable(caption = 'Genes with highest model probabilities from the Negative set') %>%
kable_styling(full_width = F)
| GeneSymbol | GeneSignificance | ModuleDiagnosis_corr | Module | Probability | gene.score |
|---|---|---|---|---|---|
| MYCBP2 | -0.3960 | -0.3847 | #00B7E7 | 0.8463 | Neuronal |
| RIMBP2 | 0.2592 | 0.5341 | #EF67EB | 0.8432 | Others |
| RPRD2 | -0.0931 | 0.3428 | #00C08C | 0.8421 | Others |
| PRDM2 | -0.2516 | 0.0721 | #96A900 | 0.8341 | Others |
| KCNJ6 | -0.1389 | -0.9003 | #00BADE | 0.8329 | Others |
| HUNK | -0.3276 | -0.3731 | #4BB400 | 0.8324 | Others |
| HIVEP2 | 0.0188 | -0.9003 | #00BADE | 0.8294 | Others |
| RFX7 | 0.1381 | 0.0721 | #96A900 | 0.8237 | Others |
| RAPH1 | -0.0554 | -0.3847 | #00B7E7 | 0.8184 | Others |
| FMNL1 | -0.2229 | -0.3847 | #00B7E7 | 0.8155 | Others |
| ST8SIA3 | -0.1994 | -0.7233 | #FF63B6 | 0.8113 | Others |
| TLN2 | -0.4786 | -0.7233 | #FF63B6 | 0.8073 | Others |
| C20orf112 | 0.0294 | 0.3428 | #00C08C | 0.8069 | Others |
| ETV6 | -0.1439 | -0.6508 | #E76BF3 | 0.8060 | Others |
| RASGRF1 | -0.7424 | -0.9003 | #00BADE | 0.8059 | Neuronal |
| TLE3 | 0.4865 | 0.5341 | #EF67EB | 0.8043 | Others |
| CACNG3 | -0.4698 | -0.6770 | #00A6FF | 0.7987 | Others |
| SIK3 | -0.1236 | -0.3847 | #00B7E7 | 0.7980 | Others |
| CELF2 | -0.0655 | -0.9003 | #00BADE | 0.7961 | Others |
| AAK1 | -0.6910 | -0.9003 | #00BADE | 0.7960 | Others |
| KCNB1 | -0.7190 | -0.9003 | #00BADE | 0.7957 | Neuronal |
| SCAF4 | 0.0161 | -0.5573 | #00C0BF | 0.7921 | Others |
| RNF111 | -0.2393 | 0.0721 | #96A900 | 0.7915 | Others |
| PLXNC1 | -0.0093 | 0.5341 | #EF67EB | 0.7913 | Others |
| BAZ2A | 0.0518 | 0.0721 | #96A900 | 0.7908 | Others |
| PCDHB13 | -0.0405 | -0.4661 | #1FB700 | 0.7891 | Others |
| GLTSCR1L | -0.1611 | 0.0721 | #96A900 | 0.7887 | Others |
| POM121C | -0.2811 | -0.5573 | #00C0BF | 0.7886 | Others |
| KMT2D | -0.3274 | -0.5573 | #00C0BF | 0.7883 | Others |
| NFAT5 | 0.1025 | 0.0721 | #96A900 | 0.7874 | Others |
| AJAP1 | -0.0441 | -0.0245 | #FE6D8D | 0.7868 | Others |
| DAAM1 | 0.1540 | -0.0127 | #00BF7D | 0.7863 | Others |
| SAP130 | -0.2497 | -0.5573 | #00C0BF | 0.7861 | Others |
| TP53INP2 | -0.2133 | -0.4661 | #1FB700 | 0.7857 | Others |
| TENM3 | 0.1768 | -0.7233 | #FF63B6 | 0.7835 | Neuronal |
| ARHGEF3 | -0.0482 | -0.9003 | #00BADE | 0.7825 | Others |
| NMNAT2 | -0.6814 | -0.9003 | #00BADE | 0.7819 | Others |
| CDC42EP3 | -0.3475 | -0.9003 | #00BADE | 0.7819 | Others |
| NCOA2 | 0.4695 | 0.5341 | #EF67EB | 0.7818 | Others |
| PCLO | -0.1874 | 0.0721 | #96A900 | 0.7812 | Others |
| AMPD3 | -0.2812 | -0.9003 | #00BADE | 0.7812 | Others |
| ASAP1 | -0.0917 | -0.3847 | #00B7E7 | 0.7810 | Others |
| AFAP1 | -0.1712 | 0.3428 | #00C08C | 0.7798 | Others |
| ZNF592 | 0.2727 | 0.3428 | #00C08C | 0.7794 | Others |
| KIAA1217 | -0.1881 | -0.9003 | #00BADE | 0.7785 | Others |
| PIEZO2 | -0.0776 | -0.6508 | #E76BF3 | 0.7784 | Others |
| DROSHA | -0.4103 | -0.5573 | #00C0BF | 0.7775 | Others |
| TET3 | 0.4848 | 0.5341 | #EF67EB | 0.7774 | Others |
| LRRC7 | 0.1828 | -0.9003 | #00BADE | 0.7770 | Others |
| GATAD2B | -0.4236 | -0.5573 | #00C0BF | 0.7757 | Others |
There’s a lot of noise, but the probability the model assigns to each gene seems to have a negative relation with the Gene Significance (under-expressed genes having on average the higher probabilities and over-expressed genes the lowest)
The pattern is stronger in under-expressed genes
# Probability vs Gene Significance, coloured by Module-Trait correlation
negative_set %>% ggplot(aes(prob, GS, color = MTcor)) + geom_point() +
geom_smooth(method = 'loess', color = '#666666') +
geom_hline(yintercept = 0, color='gray', linetype='dashed') + xlab('Probability') +
scale_color_gradientn(colours=c('#F8766D','white','#00BFC4')) +
ggtitle('Relation between Probability and Gene Significance') + theme_minimal()
There’s not a strong relation between the Module-Diagnosis correlation of the genes assigned module and the probability assigned by the model
The model seems to assign slightly higher probabilities to genes belonging to modules with negative Module-Diagnosis correlations than to genes belonging to modules with positive ones
# Module-Trait correlation vs probability per gene, coloured by GS; dashed line
# marks the overall mean probability
negative_set %>% ggplot(aes(MTcor, prob, color=GS)) + geom_point() +
geom_smooth(method='loess', color='#666666') +
geom_hline(yintercept=mean(negative_set$prob), color='gray', linetype='dashed') +
scale_color_gradientn(colours=c('#F8766D','#F8766D','white','#00BFC4','#00BFC4')) +
xlab('Modules ordered by their correlation to ASD') + ylab('Model probability') +
theme_minimal()
The difference in the trend lines between this plot and the one above is that the one above takes all the points into consideration while this considers each module as an observation by itself, so the top one is strongly affected by big modules and the bottom one treats all modules the same
The model seems to give lower probabilities to genes belonging to modules with a high (absolute) correlation to Diagnosis (this is unexpected)
# Mean probability per module (one point per module, sized by module size)
# NOTE(review): the join key is the numeric MTcor value -- exact float equality
# holds only because both columns originate from the same source; confirm
plot_data = negative_set %>% group_by(MTcor) %>% summarise(mean = mean(prob), sd = sd(prob), n = n()) %>%
mutate(MTcor_sign = ifelse(MTcor>0, 'Positive', 'Negative')) %>%
left_join(original_dataset, by='MTcor') %>%
dplyr::select(Module, MTcor, MTcor_sign, mean, sd, n) %>% distinct()
colnames(plot_data)[1] = 'ID'
ggplotly(plot_data %>% ggplot(aes(MTcor, mean, size=n, color=MTcor_sign)) + geom_point(aes(id=ID), alpha=0.7) +
geom_smooth(method='loess', color='gray', se=FALSE) + geom_smooth(method='lm', se=FALSE) +
xlab('Module-Diagnosis correlation') + ylab('Mean Probability by Model') + theme_minimal())
There is a positive relation between level of expression and probability, the model seems to be capturing indirectly the level of expression of the genes to make the prediction, so it’s introducing the same bias
We had already found some mean expression bias in the SFARI scores, but thought we had eliminated this information when constructing the WGCNA network (because correlation is invariant to linear transformations), but a signal related to this seems to have survived into the network and the model is picking it up
# Gandal dataset
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame
# Per-gene mean and sd of expression, joined to the negative set to test for
# a mean-expression bias in the model's probabilities
mean_and_sd = data.frame(ID=rownames(datExpr), meanExpr=rowMeans(datExpr), sdExpr=apply(datExpr,1,sd))
plot_data = negative_set %>% left_join(mean_and_sd, by='ID') %>%
left_join(original_dataset %>% mutate(ID=rownames(original_dataset)) %>%
dplyr::select(ID, Module), by='ID')
colnames(plot_data)[ncol(plot_data)] = 'Module'
plot_data %>% ggplot(aes(meanExpr, prob)) + geom_point(alpha=0.1, color='#0099cc') +
geom_smooth(method='loess', color='gray', alpha=0.3) + xlab('Mean Expression') +
ylab('Probability') + ggtitle('Mean expression vs model probability by gene') +
theme_minimal()
rm(mean_and_sd)
# Same relation aggregated per module: mean expression vs mean probability,
# point size = module size, point colour = the module's own colour
plot_data2 = plot_data %>% group_by(Module) %>% summarise(meanExpr = mean(meanExpr), meanProb = mean(prob),
n=n())
ggplotly(plot_data2 %>% ggplot(aes(meanExpr, meanProb, size=n)) +
geom_point(color=plot_data2$Module, alpha = 0.6) +
geom_smooth(method='loess', se=TRUE, color='gray', alpha=0.1, size=0.7) +
xlab('Mean Level of Expression') + ylab('Average Model Probability') +
theme_minimal() + theme(legend.position='none') +
ggtitle('Mean expression vs model probability by Module'))
rm(plot_data2)
There is a relation between probability and LFC, so it IS capturing a bit of true information (because LFC and mean expression were negatively correlated and it still has a positive relation in the model)
# Probability vs log fold change, overall and split by LFC sign with DE
# (padj < 0.05) highlighted
plot_data = negative_set %>% left_join(DE_info %>% mutate(ID=rownames(DE_info)), by='ID')
plot_data %>% ggplot(aes(log2FoldChange, prob)) + geom_point(alpha=0.1, color='#0099cc') +
geom_smooth(method='loess', color='gray', alpha=0.3) + xlab('LFC') +
theme_minimal() + ggtitle('LFC vs model probability by gene')
# Side-by-side panels share the same y range so the two halves are comparable
p1 = plot_data %>% filter(log2FoldChange<0) %>% mutate(DE = padj<0.05) %>%
ggplot(aes(log2FoldChange, prob, color=DE)) + geom_point(alpha=0.1) +
geom_smooth(method='loess', alpha=0.1) + xlab('') + ylab('Probability') +
ylim(c(min(plot_data$prob), max(plot_data$prob))) +
theme_minimal() + theme(legend.position = 'none', plot.margin=unit(c(1,-0.3,1,1), 'cm'))
p2 = plot_data %>% filter(log2FoldChange>=0) %>% mutate(DE = padj<0.05) %>%
ggplot(aes(log2FoldChange, prob, color=DE)) + geom_point(alpha=0.1) +
geom_smooth(method='loess', alpha=0.1) + xlab('') + ylab('Probability') + ylab('') +
scale_y_continuous(position = 'right', limits = c(min(plot_data$prob), max(plot_data$prob))) +
theme_minimal() + theme(plot.margin = unit(c(1,1,1,-0.3), 'cm'), axis.ticks.y = element_blank())
grid.arrange(p1, p2, nrow=1, top = 'LFC vs model probability by gene', bottom = 'LFC')
rm(p1, p2)
The bias is quite similar under both SFARI Gene scoring systems
# Load the ridge model fitted with the new SFARI scores (provides `predictions`)
# to compare its mean-expression bias against the current (old-scores) model
load('./../Data/Ridge_model.RData')

# Per-gene expression summary, recomputed after the load
expr_stats = data.frame(ID = rownames(datExpr),
                        meanExpr = rowMeans(datExpr),
                        sdExpr = apply(datExpr, 1, sd))

# Combine both models' probabilities: this run's probability becomes old_prob,
# the loaded model's probability stays as prob; module assignment joined last
# (last column renamed to 'Module', presumably to undo a join suffix)
plot_data = negative_set %>%
            left_join(expr_stats, by = 'ID') %>%
            dplyr::rename('old_prob' = prob) %>%
            inner_join(predictions %>% dplyr::select(ID, prob), by = 'ID') %>%
            left_join(original_dataset %>%
                        mutate(ID = rownames(original_dataset)) %>%
                        dplyr::select(ID, Module),
                      by = 'ID')
colnames(plot_data)[ncol(plot_data)] = 'Module'

# Overlay the two loess trends: pink = old SFARI scores, orange = new scores
plot_data %>% ggplot(aes(meanExpr, old_prob)) +
              geom_point(alpha = 0.1, color = '#0099cc') +
              geom_smooth(method = 'loess', color = '#cc3399', fill = '#cc3399', alpha = 0.15) +
              geom_smooth(method = 'loess', color = '#ff9900', fill = '#ff9900',
                          alpha = 0.15, aes(y = prob)) +
              xlab('Mean Expression') + ylab('Probability') +
              ggtitle('Bias with old SFARI Genes (pink) vs with New SFARI Genes (orange)') +
              theme_minimal()

rm(expr_stats)
The model is capturing the mean level of expression of the genes (indirectly, through module membership), which is a strong bias found in the SFARI scores, but it also seems to capture some true biological signal (based on the GS and the log fold change plots)
There doesn’t seem to be a big difference between models using the Old SFARI Gene scoring system and the new one
# Attach gene symbols to the test-set predictions (this overwrites the
# `predictions` object loaded from Ridge_model.RData earlier)
predictions = left_join(test_set, gene_names, by = c('ID' = 'ensembl_gene_id'))

# Persist predictions, the dataset and the fitted model for the old SFARI scores
save(predictions, dataset, fit, file = './../Data/Ridge_model_old_SFARI.RData')

sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] DMwR_0.4.1 doParallel_1.0.15 iterators_1.0.12 foreach_1.5.0
## [5] kableExtra_1.1.0 expss_0.10.2 corrplot_0.84 MLmetrics_1.1.1
## [9] car_3.0-7 carData_3.0-3 ROCR_1.0-7 gplots_3.0.3
## [13] caret_6.0-86 lattice_0.20-41 polycor_0.7-10 biomaRt_2.40.5
## [17] ggpubr_0.2.5 magrittr_1.5 RColorBrewer_1.1-2 gridExtra_2.3
## [21] viridis_0.5.1 viridisLite_0.3.0 plotly_4.9.2 knitr_1.28
## [25] forcats_0.5.0 stringr_1.4.0 dplyr_1.0.0 purrr_0.3.4
## [29] readr_1.3.1 tidyr_1.1.0 tibble_3.0.1 ggplot2_3.3.2
## [33] tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.1.0 RSQLite_2.2.0
## [3] AnnotationDbi_1.46.1 htmlwidgets_1.5.1
## [5] BiocParallel_1.18.1 pROC_1.16.2
## [7] munsell_0.5.0 codetools_0.2-16
## [9] miniUI_0.1.1.1 withr_2.2.0
## [11] colorspace_1.4-1 Biobase_2.44.0
## [13] highr_0.8 rstudioapi_0.11
## [15] stats4_3.6.3 ggsignif_0.6.0
## [17] TTR_0.23-6 labeling_0.3
## [19] GenomeInfoDbData_1.2.1 bit64_0.9-7
## [21] farver_2.0.3 vctrs_0.3.1
## [23] generics_0.0.2 ipred_0.9-9
## [25] xfun_0.12 R6_2.4.1
## [27] GenomeInfoDb_1.20.0 locfit_1.5-9.4
## [29] bitops_1.0-6 DelayedArray_0.10.0
## [31] assertthat_0.2.1 promises_1.1.0
## [33] scales_1.1.1 nnet_7.3-14
## [35] ggExtra_0.9 gtable_0.3.0
## [37] timeDate_3043.102 rlang_0.4.6
## [39] genefilter_1.66.0 splines_3.6.3
## [41] lazyeval_0.2.2 ModelMetrics_1.2.2.2
## [43] acepack_1.4.1 broom_0.5.5
## [45] checkmate_2.0.0 yaml_2.2.1
## [47] reshape2_1.4.4 abind_1.4-5
## [49] modelr_0.1.6 crosstalk_1.1.0.1
## [51] backports_1.1.8 quantmod_0.4.17
## [53] httpuv_1.5.2 Hmisc_4.4-0
## [55] tools_3.6.3 lava_1.6.7
## [57] ellipsis_0.3.1 BiocGenerics_0.30.0
## [59] Rcpp_1.0.4.6 plyr_1.8.6
## [61] base64enc_0.1-3 progress_1.2.2
## [63] zlibbioc_1.30.0 RCurl_1.98-1.2
## [65] prettyunits_1.1.1 rpart_4.1-15
## [67] zoo_1.8-8 S4Vectors_0.22.1
## [69] SummarizedExperiment_1.14.1 haven_2.2.0
## [71] cluster_2.1.0 fs_1.4.0
## [73] data.table_1.12.8 openxlsx_4.1.4
## [75] reprex_0.3.0 matrixStats_0.56.0
## [77] hms_0.5.3 mime_0.9
## [79] evaluate_0.14 xtable_1.8-4
## [81] XML_3.99-0.3 rio_0.5.16
## [83] jpeg_0.1-8.1 readxl_1.3.1
## [85] shape_1.4.4 IRanges_2.18.3
## [87] compiler_3.6.3 KernSmooth_2.23-17
## [89] crayon_1.3.4 htmltools_0.4.0
## [91] mgcv_1.8-31 later_1.0.0
## [93] Formula_1.2-3 geneplotter_1.62.0
## [95] lubridate_1.7.4 DBI_1.1.0
## [97] dbplyr_1.4.2 MASS_7.3-51.6
## [99] Matrix_1.2-18 cli_2.0.2
## [101] gdata_2.18.0 gower_0.2.1
## [103] GenomicRanges_1.36.1 pkgconfig_2.0.3
## [105] foreign_0.8-76 recipes_0.1.10
## [107] xml2_1.2.5 annotate_1.62.0
## [109] webshot_0.5.2 XVector_0.24.0
## [111] prodlim_2019.11.13 rvest_0.3.5
## [113] digest_0.6.25 rmarkdown_2.1
## [115] cellranger_1.1.0 htmlTable_1.13.3
## [117] curl_4.3 shiny_1.4.0.2
## [119] gtools_3.8.2 lifecycle_0.2.0
## [121] nlme_3.1-147 jsonlite_1.7.0
## [123] fansi_0.4.1 pillar_1.4.4
## [125] fastmap_1.0.1 httr_1.4.1
## [127] survival_3.1-12 xts_0.12-0
## [129] glue_1.4.1 zip_2.0.4
## [131] png_0.1-7 glmnet_3.0-2
## [133] bit_1.1-15.2 class_7.3-17
## [135] stringi_1.4.6 blob_1.2.1
## [137] DESeq2_1.24.0 latticeExtra_0.6-29
## [139] caTools_1.18.0 memoise_1.1.0